Data Analysis

Comprehensive Data Cleaning & Exploratory Analysis of Job Market Trends

Authors
Affiliation

Norah Jones

Boston University

John Hamm

Boston University

import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import plotly.express as px


df = pd.read_csv("lightcast_job_postings.csv")  # Make sure this is your converted file
df.columns = df.columns.str.strip().str.lower()  # Normalize column names

print(df.columns.tolist())  # You should now see: 'lightcast_sectors_name'
/tmp/ipykernel_2053/2709272696.py:7: DtypeWarning: Columns (19,30) have mixed types. Specify dtype option on import or set low_memory=False.
  df = pd.read_csv("lightcast_job_postings.csv")  # Make sure this is your converted file
['id', 'last_updated_date', 'last_updated_timestamp', 'duplicates', 'posted', 'expired', 'duration', 'source_types', 'sources', 'url', 'active_urls', 'active_sources_info', 'title_raw', 'body', 'modeled_expired', 'modeled_duration', 'company', 'company_name', 'company_raw', 'company_is_staffing', 'education_levels', 'education_levels_name', 'min_edulevels', 'min_edulevels_name', 'max_edulevels', 'max_edulevels_name', 'employment_type', 'employment_type_name', 'min_years_experience', 'max_years_experience', 'is_internship', 'salary', 'remote_type', 'remote_type_name', 'original_pay_period', 'salary_to', 'salary_from', 'location', 'city', 'city_name', 'county', 'county_name', 'msa', 'msa_name', 'state', 'state_name', 'county_outgoing', 'county_name_outgoing', 'county_incoming', 'county_name_incoming', 'msa_outgoing', 'msa_name_outgoing', 'msa_incoming', 'msa_name_incoming', 'naics2', 'naics2_name', 'naics3', 'naics3_name', 'naics4', 'naics4_name', 'naics5', 'naics5_name', 'naics6', 'naics6_name', 'title', 'title_name', 'title_clean', 'skills', 'skills_name', 'specialized_skills', 'specialized_skills_name', 'certifications', 'certifications_name', 'common_skills', 'common_skills_name', 'software_skills', 'software_skills_name', 'onet', 'onet_name', 'onet_2019', 'onet_2019_name', 'cip6', 'cip6_name', 'cip4', 'cip4_name', 'cip2', 'cip2_name', 'soc_2021_2', 'soc_2021_2_name', 'soc_2021_3', 'soc_2021_3_name', 'soc_2021_4', 'soc_2021_4_name', 'soc_2021_5', 'soc_2021_5_name', 'lot_career_area', 'lot_career_area_name', 'lot_occupation', 'lot_occupation_name', 'lot_specialized_occupation', 'lot_specialized_occupation_name', 'lot_occupation_group', 'lot_occupation_group_name', 'lot_v6_specialized_occupation', 'lot_v6_specialized_occupation_name', 'lot_v6_occupation', 'lot_v6_occupation_name', 'lot_v6_occupation_group', 'lot_v6_occupation_group_name', 'lot_v6_career_area', 'lot_v6_career_area_name', 'soc_2', 'soc_2_name', 'soc_3', 'soc_3_name', 'soc_4', 'soc_4_name', 'soc_5', 'soc_5_name', 'lightcast_sectors', 'lightcast_sectors_name', 'naics_2022_2', 'naics_2022_2_name', 'naics_2022_3', 'naics_2022_3_name', 'naics_2022_4', 'naics_2022_4_name', 'naics_2022_5', 'naics_2022_5_name', 'naics_2022_6', 'naics_2022_6_name']
columns_to_drop = [
    "id", "url", "active_urls", "duplicates", "last_updated_timestamp",
    "naics2", "naics3", "naics4", "naics5", "naics6",
    "soc_2", "soc_3", "soc_5"
]
df.drop(columns=columns_to_drop, inplace=True)
df.head()
last_updated_date posted expired duration source_types sources active_sources_info title_raw body modeled_expired ... naics_2022_2 naics_2022_2_name naics_2022_3 naics_2022_3_name naics_2022_4 naics_2022_4_name naics_2022_5 naics_2022_5_name naics_2022_6 naics_2022_6_name
0 9/6/2024 6/2/2024 6/8/2024 6.0 [\n "Company"\n] [\n "brassring.com"\n] NaN Enterprise Analyst (II-III) 31-May-2024\n\nEnterprise Analyst (II-III)\n\n... 6/8/2024 ... 44.0 Retail Trade 441.0 Motor Vehicle and Parts Dealers 4413.0 Automotive Parts, Accessories, and Tire Retailers 44133.0 Automotive Parts and Accessories Retailers 441330.0 Automotive Parts and Accessories Retailers
1 8/2/2024 6/2/2024 8/1/2024 NaN [\n "Job Board"\n] [\n "maine.gov"\n] NaN Oracle Consultant - Reports (3592) Oracle Consultant - Reports (3592)\n\nat SMX i... 8/1/2024 ... 56.0 Administrative and Support and Waste Managemen... 561.0 Administrative and Support Services 5613.0 Employment Services 56132.0 Temporary Help Services 561320.0 Temporary Help Services
2 9/6/2024 6/2/2024 7/7/2024 35.0 [\n "Job Board"\n] [\n "dejobs.org"\n] NaN Data Analyst Taking care of people is at the heart of every... 6/10/2024 ... 52.0 Finance and Insurance 524.0 Insurance Carriers and Related Activities 5242.0 Agencies, Brokerages, and Other Insurance Rela... 52429.0 Other Insurance Related Activities 524291.0 Claims Adjusting
3 9/6/2024 6/2/2024 7/20/2024 48.0 [\n "Job Board"\n] [\n "disabledperson.com",\n "dejobs.org"\n] NaN Sr. Lead Data Mgmt. Analyst - SAS Product Owner About this role:\n\nWells Fargo is looking for... 6/12/2024 ... 52.0 Finance and Insurance 522.0 Credit Intermediation and Related Activities 5221.0 Depository Credit Intermediation 52211.0 Commercial Banking 522110.0 Commercial Banking
4 6/19/2024 6/2/2024 6/17/2024 15.0 [\n "FreeJobBoard"\n] [\n "craigslist.org"\n] NaN Comisiones de $1000 - $3000 por semana... Comi... Comisiones de $1000 - $3000 por semana... Comi... 6/17/2024 ... 99.0 Unclassified Industry 999.0 Unclassified Industry 9999.0 Unclassified Industry 99999.0 Unclassified Industry 999999.0 Unclassified Industry

5 rows × 118 columns

# Visualize missing values
msno.heatmap(df)
plt.title("Missing Values Heatmap")
plt.show()

for col in df.columns:
    print(repr(col))
'last_updated_date'
'posted'
'expired'
'duration'
'source_types'
'sources'
'active_sources_info'
'title_raw'
'body'
'modeled_expired'
'modeled_duration'
'company'
'company_name'
'company_raw'
'company_is_staffing'
'education_levels'
'education_levels_name'
'min_edulevels'
'min_edulevels_name'
'max_edulevels'
'max_edulevels_name'
'employment_type'
'employment_type_name'
'min_years_experience'
'max_years_experience'
'is_internship'
'salary'
'remote_type'
'remote_type_name'
'original_pay_period'
'salary_to'
'salary_from'
'location'
'city'
'city_name'
'county'
'county_name'
'msa'
'msa_name'
'state'
'state_name'
'county_outgoing'
'county_name_outgoing'
'county_incoming'
'county_name_incoming'
'msa_outgoing'
'msa_name_outgoing'
'msa_incoming'
'msa_name_incoming'
'naics2_name'
'naics3_name'
'naics4_name'
'naics5_name'
'naics6_name'
'title'
'title_name'
'title_clean'
'skills'
'skills_name'
'specialized_skills'
'specialized_skills_name'
'certifications'
'certifications_name'
'common_skills'
'common_skills_name'
'software_skills'
'software_skills_name'
'onet'
'onet_name'
'onet_2019'
'onet_2019_name'
'cip6'
'cip6_name'
'cip4'
'cip4_name'
'cip2'
'cip2_name'
'soc_2021_2'
'soc_2021_2_name'
'soc_2021_3'
'soc_2021_3_name'
'soc_2021_4'
'soc_2021_4_name'
'soc_2021_5'
'soc_2021_5_name'
'lot_career_area'
'lot_career_area_name'
'lot_occupation'
'lot_occupation_name'
'lot_specialized_occupation'
'lot_specialized_occupation_name'
'lot_occupation_group'
'lot_occupation_group_name'
'lot_v6_specialized_occupation'
'lot_v6_specialized_occupation_name'
'lot_v6_occupation'
'lot_v6_occupation_name'
'lot_v6_occupation_group'
'lot_v6_occupation_group_name'
'lot_v6_career_area'
'lot_v6_career_area_name'
'soc_2_name'
'soc_3_name'
'soc_4'
'soc_4_name'
'soc_5_name'
'lightcast_sectors'
'lightcast_sectors_name'
'naics_2022_2'
'naics_2022_2_name'
'naics_2022_3'
'naics_2022_3_name'
'naics_2022_4'
'naics_2022_4_name'
'naics_2022_5'
'naics_2022_5_name'
'naics_2022_6'
'naics_2022_6_name'

missing_percentage = df.isnull().mean() * 100
print(missing_percentage)
last_updated_date     0.030346
posted                0.030346
expired              10.819609
duration             37.678281
source_types          0.030346
                       ...    
naics_2022_4_name     0.060691
naics_2022_5          0.060691
naics_2022_5_name     0.060691
naics_2022_6          0.060691
naics_2022_6_name     0.060691
Length: 118, dtype: float64

# Fill missing values
df["min_years_experience"].fillna(df["min_years_experience"].median(), inplace=True)
df["lightcast_sectors_name"].fillna("Unknown", inplace=True)
for col in df.columns:
    print(repr(col))
'last_updated_date'
'posted'
'expired'
'duration'
'source_types'
'sources'
'active_sources_info'
'title_raw'
'body'
'modeled_expired'
'modeled_duration'
'company'
'company_name'
'company_raw'
'company_is_staffing'
'education_levels'
'education_levels_name'
'min_edulevels'
'min_edulevels_name'
'max_edulevels'
'max_edulevels_name'
'employment_type'
'employment_type_name'
'min_years_experience'
'max_years_experience'
'is_internship'
'salary'
'remote_type'
'remote_type_name'
'original_pay_period'
'salary_to'
'salary_from'
'location'
'city'
'city_name'
'county'
'county_name'
'msa'
'msa_name'
'state'
'state_name'
'county_outgoing'
'county_name_outgoing'
'county_incoming'
'county_name_incoming'
'msa_outgoing'
'msa_name_outgoing'
'msa_incoming'
'msa_name_incoming'
'naics2_name'
'naics3_name'
'naics4_name'
'naics5_name'
'naics6_name'
'title'
'title_name'
'title_clean'
'skills'
'skills_name'
'specialized_skills'
'specialized_skills_name'
'certifications'
'certifications_name'
'common_skills'
'common_skills_name'
'software_skills'
'software_skills_name'
'onet'
'onet_name'
'onet_2019'
'onet_2019_name'
'cip6'
'cip6_name'
'cip4'
'cip4_name'
'cip2'
'cip2_name'
'soc_2021_2'
'soc_2021_2_name'
'soc_2021_3'
'soc_2021_3_name'
'soc_2021_4'
'soc_2021_4_name'
'soc_2021_5'
'soc_2021_5_name'
'lot_career_area'
'lot_career_area_name'
'lot_occupation'
'lot_occupation_name'
'lot_specialized_occupation'
'lot_specialized_occupation_name'
'lot_occupation_group'
'lot_occupation_group_name'
'lot_v6_specialized_occupation'
'lot_v6_specialized_occupation_name'
'lot_v6_occupation'
'lot_v6_occupation_name'
'lot_v6_occupation_group'
'lot_v6_occupation_group_name'
'lot_v6_career_area'
'lot_v6_career_area_name'
'soc_2_name'
'soc_3_name'
'soc_4'
'soc_4_name'
'soc_5_name'
'lightcast_sectors'
'lightcast_sectors_name'
'naics_2022_2'
'naics_2022_2_name'
'naics_2022_3'
'naics_2022_3_name'
'naics_2022_4'
'naics_2022_4_name'
'naics_2022_5'
'naics_2022_5_name'
'naics_2022_6'
'naics_2022_6_name'
# Remove duplicates based on 'title', 'company_name', 'location', and 'posted'
df = df.drop_duplicates(subset=["title", "company_name", "location", "posted"], keep="first")
fig = px.bar(df["lightcast_sectors_name"].value_counts(), title="Job Postings by Industry")
fig.show()
fig = px.box(df, x="lightcast_sectors_name", y="salary", title="Salary Distribution by Industry")
fig.show()
fig = px.pie(df, names="remote_type_name", title="Remote vs. On-Site Jobs")
fig.show()
fig = px.box(df, x="education_levels_name", y="salary", title="Education Level vs. Salary")
fig.show()
fig = px.box(df, x="company_name", y="salary", title="Salary Distribution by Company")
fig.show()